Load the requiered libraries for the analysis:
#library(nCov2019)
library(leaflet)
library(dplyr)
library(ggplot2)
library(plotly)
library(scales)
library(xts)
library(dygraphs)
library(corrplot)
library(lubridate)
library(fmsb)
library(forecast)
Load the given data, and the one updated from 2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE
COVID<-read.csv("covid_19_data.csv")
COVID_2<-read.csv("COVID19_14-Apr.csv")
Format date:
Date<-as.Date(COVID_2$Date, format="%m/%d/%y")
COVID_2$Date2<-Date
Filter to obtain the last date:
COVID_updated<-COVID_2 %>% filter(Date2==max(Date2))
World map cases:
leaflet(width = "100%") %>%
addProviderTiles("CartoDB.DarkMatter") %>%
setView(lng = 0, lat = 10, zoom = 1.5) %>%
addCircleMarkers(data = COVID_updated,
lng = ~ Long,
lat = ~ Lat,
radius = ~ log(Confirmed+1),
color = rgb(218/255,65/255,56/255),
fillOpacity = ~ ifelse(Confirmed > 0, 1, 0),
stroke = FALSE,
label = ~ paste(Province.State,",",Country.Region, ": ", Confirmed)
)
Current top 10 countries by cases:
COVID_top<-COVID_2 %>% filter(Date2==max(Date2)) %>%
group_by(Country.Region) %>% summarise(Total_confirmed=sum(Confirmed)) %>%
top_n(10,Total_confirmed) %>% arrange(desc(Total_confirmed))
plot<-ggplot(data=COVID_top
, aes(x=Total_confirmed,y=reorder(Country.Region,Total_confirmed))) +
geom_bar(stat ="identity",alpha=0.8,fill="firebrick3") +
geom_text(aes(label=Total_confirmed), vjust=0.5, hjust=0.9,color="black", size=3.5) +
scale_x_continuous(labels = comma) +
labs(title = paste("Top 10 countries with confirmed cases as of ",max(COVID_2$Date2)),
x = "Confirmed cases",
y = "Country") +
theme_minimal()
ggplotly(plot,tooltip = c("x"),width=750)
Time distribution of cases, deaths and recovered:
COVID_2_Day<- COVID_2 %>% group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed),
World_deaths=sum(Deaths),
World_recovered=sum(Recovered))
COVID_Day_confirmed_series<-xts(COVID_2_Day$World_confirmed, order.by=COVID_2_Day$Date2)
COVID_Day_deaths_series<-xts(COVID_2_Day$World_deaths, order.by=COVID_2_Day$Date2)
COVID_Day_recovered_series<-xts(COVID_2_Day$World_recovered, order.by=COVID_2_Day$Date2)
Day_summary<-cbind(COVID_Day_confirmed_series,COVID_Day_deaths_series,COVID_Day_recovered_series)
dygraph(Day_summary, main = "SARS-COV2-outbreak: Total worldwide cases",
xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_Day_confirmed_series", "Total cases",drawPoints = TRUE,
pointSize = 3, color=rgb(53/255,116/255,199/255)) %>%
dySeries("COVID_Day_deaths_series", "Total deaths",drawPoints = TRUE,
pointSize = 3, color=rgb(189/255,55/255,48/255)) %>%
dySeries("COVID_Day_recovered_series", "Total recovered",drawPoints = TRUE,
pointSize = 3, color=rgb(69/255,136/255,51/255)) %>%
dyRangeSelector()
Create a function to calculate new cases:
New_count<-function(x)
{
Daily_cases<-numeric(length(x))
for(i in length(x):2)
{
Daily_cases[i]=x[i] - x[i-1]
}
return(Daily_cases)
}
New_cases<-New_count(COVID_2_Day$World_confirmed)
New_deaths<-New_count(COVID_2_Day$World_deaths)
New_recovered<-New_count(COVID_2_Day$World_recovered)
COVID_New_confirmed_series<-xts(New_cases, order.by=COVID_2_Day$Date2)
COVID_New_deaths_series<-xts(New_deaths, order.by=COVID_2_Day$Date2)
COVID_New_recovered_series<-xts(New_recovered, order.by=COVID_2_Day$Date2)
New_summary<-cbind(COVID_New_confirmed_series,COVID_New_deaths_series,COVID_New_recovered_series)
New cases, deaths and recovered time distribution:
dygraph(New_summary, main = "SARS-COV2-outbreak: Total worldwide cases",
xlab="Date", ylab="Novel coronavirus cases",width = 750) %>%
dySeries("COVID_New_confirmed_series", "New cases",drawPoints = TRUE,
pointSize = 3, color=rgb(53/255,116/255,199/255)) %>%
dySeries("COVID_New_deaths_series", "New deaths",drawPoints = TRUE,
pointSize = 3, color=rgb(189/255,55/255,48/255)) %>%
dySeries("COVID_New_recovered_series", "New recovered",drawPoints = TRUE,
pointSize = 3, color=rgb(69/255,136/255,51/255)) %>%
dyRangeSelector()
Total cases by some countries time distribution:
COVID_2_Day_Lebanon<- COVID_2 %>%
filter(Country.Region %in% c("Lebanon")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Chile<- COVID_2 %>%
filter(Country.Region %in% c("Chile")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Colombia<- COVID_2 %>%
filter(Country.Region %in% c("Colombia")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_CostaRica<- COVID_2 %>%
filter(Country.Region %in% c("Costa Rica")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_Day_series_Lebanon<-xts(COVID_2_Day_Lebanon$World_confirmed, order.by=COVID_2_Day_Lebanon$Date2)
COVID_Day_series_Chile<-xts(COVID_2_Day_Chile$World_confirmed, order.by=COVID_2_Day_Chile$Date2)
COVID_Day_series_Colombia<-xts(COVID_2_Day_Colombia$World_confirmed, order.by=COVID_2_Day_Colombia$Date2)
COVID_Day_series_CostaRica<-xts(COVID_2_Day_CostaRica$World_confirmed, order.by=COVID_2_Day_CostaRica$Date2)
Our_Countries<-cbind(COVID_Day_series_Lebanon,COVID_Day_series_Chile,COVID_Day_series_Colombia,COVID_Day_series_CostaRica)
dygraph(Our_Countries, main = "SARS-COV2-outbreak: Total cases by country", xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_Day_series_Lebanon", "Lebanon",drawPoints = TRUE,
pointSize = 3, color=rgb(0,0,3/255)) %>%
dySeries("COVID_Day_series_Chile", "Chile",drawPoints = TRUE,
pointSize = 3,color=rgb(120/255,28/255,109/255)) %>%
dySeries("COVID_Day_series_Colombia", "Colombia",drawPoints = TRUE,
pointSize = 3,color=rgb(237/255,105/255,37/255)) %>%
dySeries("COVID_Day_series_CostaRica", "Costa Rica",drawPoints = TRUE,
pointSize = 3,color=rgb(204/255,197/255,126/255)) %>%
dyRangeSelector()
New cases by some countries time distribution:
New_Lebanon<-New_count(COVID_2_Day_Lebanon$World_confirmed)
New_Chile<-New_count(COVID_2_Day_Chile$World_confirmed)
New_Colombia<-New_count(COVID_2_Day_Colombia$World_confirmed)
New_CostaRica<-New_count(COVID_2_Day_CostaRica$World_confirmed)
COVID_New_series_Lebanon<-xts(New_Lebanon, order.by=COVID_2_Day_Lebanon$Date2)
COVID_New_series_Chile<-xts(New_Chile, order.by=COVID_2_Day_Chile$Date2)
COVID_New_series_Colombia<-xts(New_Colombia, order.by=COVID_2_Day_Colombia$Date2)
COVID_New_series_CostaRica<-xts(New_CostaRica, order.by=COVID_2_Day_CostaRica$Date2)
Our_New_Countries<-cbind(COVID_New_series_Lebanon,COVID_New_series_Chile,COVID_New_series_Colombia,COVID_New_series_CostaRica)
dygraph(Our_New_Countries, main = "SARS-COV2-outbreak: New cases by country", xlab="Date", ylab="Total cases",width = 750) %>%
dySeries("COVID_New_series_Lebanon", "Lebanon",drawPoints = TRUE,
pointSize = 3, color=rgb(0,0,3/255)) %>%
dySeries("COVID_New_series_Chile", "Chile",drawPoints = TRUE,
pointSize = 3,color=rgb(120/255,28/255,109/255)) %>%
dySeries("COVID_New_series_Colombia", "Colombia",drawPoints = TRUE,
pointSize = 3,color=rgb(237/255,105/255,37/255)) %>%
dySeries("COVID_New_series_CostaRica", "Costa Rica",drawPoints = TRUE,
pointSize = 3,color=rgb(204/255,197/255,126/255)) %>%
dyRangeSelector()
3D scatterplot of confirmed cases Vs. Deaths Vs. Recovered, using the data for the last day
fig <- plot_ly(COVID_updated, x = ~Confirmed, y = ~Deaths, z = ~Recovered, width=750) %>%
add_markers(text= ~Country.Region ,hoverinfo= "text",
marker = list(color=rgb(189/255,55/255,48/255))) %>%
layout(title="Confirmed cases Vs. Deaths Vs. Recovered", scene = list(
xaxis = list(title = 'Confirmed'),
yaxis = list(title = 'Deaths'),
zaxis = list(title = 'Recovered')))
fig
In this step, we will look for correlations of the number of cases of COVID-19, by some known indicators. Here each individual of analysis (tuple) is a country.
The Human Development Index is calculated by the United Nations. And its located here: United Nations Human Development reports
HDI<-read.csv("Human Development Index (HDI)_2.csv",sep=";",dec=",")
Group data by country:
COVID_Country<-COVID_2 %>% filter(Date2==max(Date2)) %>%
group_by(Country.Region) %>% summarise(Total_confirmed=sum(Confirmed),
Total_deaths=sum(Deaths),
Total_Recovered=sum(Recovered))
Remove after parentheses: Use regular expresions to clean the countries name on the HDI file, this to use this variable as key to join with the COVID-19 countries table:
HDI$Country_2<-gsub("\\s*\\([^\\)]+\\)","",as.character(HDI$Country))
HDI$Country_2[HDI$Country_2=="United States"]<-"US"
HDI$Country_2[HDI$Country_2=="Korea"]<-"South Korea"
Because the number of cases by country is clearly affected by the population (for example the number of cases in Lebanon is way smaller than the US, but Lebanon has 6M inhabitants and the US 330M), this variable was used to obtain the number of cases per million inhabitants.
The data used is from the World Bank indicators
Population<-read.csv("World_population.csv",sep=";",dec=",")
Remove after commma: Use regular expresions to clean the countries name on the population file, this to use this variable as key to join with the COVID-19 countries table:
Population$Country_Name_2<-gsub(",.*", "", as.character(Population$Country_Name))
Population$Country_Name_2[Population$Country_Name_2=="United States"]<-"US"
Population$Country_Name_2[Population$Country_Code=="KOR"]<-"South Korea"
Population$Country_Name_2[Population$Country_Code=="CZE"]<-"Czechia"
Natural Join:
COVID_3<- COVID_Country %>% inner_join(HDI,by=c("Country.Region"="Country_2")) %>%
inner_join(Population,by=c("Country.Region"="Country_Name_2")) %>%
select(Country.Region,Total_confirmed,Total_deaths,Total_Recovered,HDI_Rank_2018,Year_2018,
Country_Code,Population_2018) %>%
mutate(Cases_million=(Total_confirmed/Population_2018)*1000000,
Recovered_percentage=(Total_Recovered/Total_confirmed)*100)
COVID_3<-COVID_3[!is.na(COVID_3$Population_2018),]
Countries with top 10 cases per million inhabitants
COVID3_top<-COVID_3 %>% top_n(10,Cases_million) %>% arrange(desc(Cases_million)) %>%
mutate(Cases_million=round(Cases_million,0))
plot<-ggplot(data=COVID3_top
, aes(x=Cases_million,y=reorder(Country.Region,Cases_million))) +
geom_bar(stat ="identity",alpha=0.8,fill="sky blue", ) +
geom_text(aes(label=Cases_million), vjust=0.5, hjust=0.9,color="black", size=3.5) +
scale_x_continuous(labels = comma) +
labs(title = paste("Top 10 countries with confirmed cases per million habitant \n as of ",max(COVID_2$Date2)),
x = "Confirmed cases per million habitants",
y = "Country") +
theme_minimal()
ggplotly(plot,tooltip = c("x"),width=750)
Plot the Human Development Index(HDI) Vs. the number of cases (applying a log transformation, to avoid effects of outliers), and the proportion of recovered cases:
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Year_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(237/255,105/255,37/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="HDI Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="HDI")
ggplotly(plot,tooltip = c("text"),width=750)
Correlation matrix:
COVID_numeric_1<-COVID_3 %>% mutate(Log_cases=log(Cases_million),
Death_percentage=(Total_deaths/Total_confirmed)*100) %>%
select(Log_cases,Recovered_percentage,Death_percentage,Year_2018)
corrplot(cor(COVID_numeric_1),method = "number",tl.col="black",tl.srt=15,
col=colorRampPalette(c(rgb(204/255,197/255,126/255),rgb(237/255,105/255,37/255)))(200))
Measles<-read.csv("Measles_immunization.csv",sep=";",dec=".")
Measles$Country_2<-gsub("\\s*\\([^\\)]+\\)","",as.character(Measles$Country))
Measles$Country_2[Measles$Country_2=="United States"]<-"US"
Measles$Country_2[Measles$Country_2=="Korea"]<-"South Korea"
COVID_3<- COVID_3 %>% inner_join(Measles,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
The next variable to consider is the percentage of the GDP that the countries spend on health, which is also available on United Nations Human Development reports
Health_expenditure<-read.csv("Health_expenditure_GDP.csv",sep=";",dec=".")
Remove after parentheses: Use regular expresions to clean the countries name on the health expenditure file, this to use this variable as key to join with the COVID-19 countries table:
Health_expenditure$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(Health_expenditure$Country))
Health_expenditure$Country_2[Health_expenditure$Country_2=="United States"]<-"US"
Health_expenditure$Country_2[Health_expenditure$Country_2=="Korea"]<-"South Korea"
Natural join of tables:
COVID_3<- COVID_3 %>% inner_join(Health_expenditure,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
Plot the % of GDP spent on health Vs the Cases/1M population:
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Expenditure_2016,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(204/255,197/255,126/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="Health expenditure (% of GDP) Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="% of GDP in health")
ggplotly(plot,tooltip = c("text"),width=750)
Other variable to considered is the temperature of the countries. While the is no empirical evidence that the weather affects or helps the spread of the SARS-COV 2 virus. This paper covers how generally, the weather had affected the spread of viral respiratory infections
This data contains the temperature (in celcius) of the first day of the month, for all countries since 1900, and it is available on this Kaggle
Temperature<-read.csv("GlobalLandTemperaturesByCountry.csv",sep=";")
Format date:
Date_Temp<-as.Date(Temperature$dt, format="%d/%m/%Y")
Temperature$Date_Temp<-Date_Temp
Extract month and filter for IQ (Jan, Feb and Mar), because that are the months our COVID data is available, and obtain the average IQ temperature:
Temperature<- Temperature %>% mutate(Month=month(Temperature$Date_Temp,label =TRUE),
Year=year(Temperature$Date_Temp)) %>%
filter(Month %in% c("Jan","Feb","Mar") & Year>=2000) %>%
group_by(Country) %>% summarise(Avg_IQ_temperature=mean(AverageTemperature))
Temperature<-Temperature[!is.na(Temperature$Avg_IQ_temperature),]
Regular expressions to clean the countries name:
Temperature$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(Temperature$Country))
Temperature$Country_2[Temperature$Country_2=="United States"]<-"US"
Temperature$Country_2[Temperature$Country_2=="Korea"]<-"South Korea"
Join tables:
COVID_3<- COVID_3 %>% inner_join(Temperature,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
#write.csv(COVID_3,"COVID_Covariables.csv")
Plot the average temperature of the first quarter Vs. the cases/1M habitants:
plot<-ggplot(data=COVID_3,aes(x=log(Cases_million),y=Avg_IQ_temperature,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(120/255,28/255,109/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="Average IQ temperature Vs. logarithmus of COVID-19 cases by million inhabitants \n and proportion of recovered",
x="ln(Cases/1M population)",
y="Temperature (Celcius)")
ggplotly(plot,tooltip = c("text"),width=750)
This step is similar to the previous one, where each individual of analysis (tuple) is a country, but here we are considering the death rate percentage.
The DTP is a class of combination vaccines against three infectious diseases in humans: diphtheria, pertussis (whooping cough), and tetanus, the data is also available on the United Nations Human Development reports
DTP_immunization<-read.csv("Infants_lacking_immunization_DTP.csv",sep=";")
Remove after parentheses: regular expression cleaning
DTP_immunization$Country_2<-gsub("\\s*\\([^\\)]+\\)","",
as.character(DTP_immunization$Country))
DTP_immunization$Country_2[DTP_immunization$Country_2=="United States"]<-"US"
DTP_immunization$Country_2[DTP_immunization$Country_2=="Korea"]<-"South Korea"
Join tables:
COVID_4<- COVID_Country %>% inner_join(DTP_immunization,
by=c("Country.Region"="Country_2")) %>%
select(Country.Region,Total_confirmed,Total_deaths,Total_Recovered,
Lack_DTP_inmmunization_2018) %>%
mutate(Recovered_percentage=(Total_Recovered/Total_confirmed)*100,
Death_rate=(Total_deaths/Total_confirmed)*100)
Plot the death rate Vs. % of infants lacking DTP immunization:
plot<-ggplot(data=COVID_4,aes(x=Death_rate,y=Lack_DTP_inmmunization_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(120/255,28/255,109/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="% infants lacking DTP immunization Vs. death rate and \n proportion of recovered",
x="Novel coronavirus death rate",
y="% of infants")
ggplotly(plot,tooltip = c("text"),width=750)
The next vaccine that was evaluated was measles, the data is also available on the United Nations Human Development reports
COVID_4<- COVID_4 %>% inner_join(Measles,by=c("Country.Region"="Country_2")) %>% select(-c("Country"))
See the distribution of the % od infants lacking measles vaccine, the median of this variable is below 10%, with some outliers over 40% in some countries
ggplot(COVID_4, aes(y=Measles_2018)) +
geom_boxplot(fill="dodgerblue4",outlier.shape = 21,
outlier.fill = "firebrick",alpha=0.75) +
ggtitle("Boxplot of % infants lacking measles immunization") + ylab("% of infants") +
theme_minimal()
Histogram:
ggplot(COVID_4, aes(Measles_2018)) +
geom_histogram(fill="dodgerblue4",bins=20,alpha=0.8) +
ggtitle("Histogram of % infants lacking measles immunization") +
xlab("% of infants") +
ylab("Count") +
theme_minimal()
Plot % of infants lacking this immunization vs the fatality rate:
plot<-ggplot(data=COVID_4,aes(x=Death_rate,y=Measles_2018,
size=Recovered_percentage,text=Country.Region)) +
geom_point(color="black",fill=rgb(237/255,105/255,37/255),shape=21,alpha=0.6) +
scale_size(range = c(3,15), name="Recovered \n percentage") +
theme_minimal() +
theme(legend.position="bottom") +
labs(title="% infants lacking measles immunization Vs. fatality rate \n and proportion of recovered",
x="Fatality rate (%)",
y="% of infants")
ggplotly(plot,tooltip = c("text"),width=750)
Here the individuals of analysis (tuples) will be each country, the response variable will be the cases per million inhabitants, and the explanatory variables showed before.
Because of the presence of extreme cases per million inhabitants in some countries, it is convenient to make a ln transformation, and because some temperatures are below 0 celcius, they will be converted to kelvin (adding 273.5) to the celcius temperature.
First model:
Mod1<-lm(log(COVID_3$Cases_million)~log(COVID_3$Year_2018)+log(COVID_3$Measles_2018)+
log(COVID_3$Expenditure_2016)+log(COVID_3$Avg_IQ_temperature+273.15))
summary(Mod1)
Call:
lm(formula = log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) +
log(COVID_3$Measles_2018) + log(COVID_3$Expenditure_2016) +
log(COVID_3$Avg_IQ_temperature + 273.15))
Residuals:
Min 1Q Median 3Q Max
-4.1125 -0.8551 -0.1668 0.8243 5.1749
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 35.85291 20.09897 1.784 0.07662 .
log(COVID_3$Year_2018) 7.62697 0.67683 11.269 < 2e-16 ***
log(COVID_3$Measles_2018) 0.03507 0.12139 0.289 0.77312
log(COVID_3$Expenditure_2016) 0.88946 0.31841 2.793 0.00595 **
log(COVID_3$Avg_IQ_temperature + 273.15) -5.42430 3.53904 -1.533 0.12761
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.391 on 140 degrees of freedom
Multiple R-squared: 0.694, Adjusted R-squared: 0.6853
F-statistic: 79.39 on 4 and 140 DF, p-value: < 2.2e-16
Stepwise with AIC criterion, to see which variables are better:
Mod2<-step(Mod1,direction = "both")
Start: AIC=100.7
log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) + log(COVID_3$Measles_2018) +
log(COVID_3$Expenditure_2016) + log(COVID_3$Avg_IQ_temperature +
273.15)
Df Sum of Sq RSS AIC
- log(COVID_3$Measles_2018) 1 0.162 271.19 98.782
<none> 271.03 100.695
- log(COVID_3$Avg_IQ_temperature + 273.15) 1 4.548 275.57 101.108
- log(COVID_3$Expenditure_2016) 1 15.106 286.13 106.560
- log(COVID_3$Year_2018) 1 245.830 516.86 192.300
Step: AIC=98.78
log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) + log(COVID_3$Expenditure_2016) +
log(COVID_3$Avg_IQ_temperature + 273.15)
Df Sum of Sq RSS AIC
<none> 271.19 98.782
- log(COVID_3$Avg_IQ_temperature + 273.15) 1 4.474 275.66 99.154
+ log(COVID_3$Measles_2018) 1 0.162 271.03 100.695
- log(COVID_3$Expenditure_2016) 1 15.410 286.60 104.795
- log(COVID_3$Year_2018) 1 301.006 572.19 205.048
Final model:
summary(Mod2)
Call:
lm(formula = log(COVID_3$Cases_million) ~ log(COVID_3$Year_2018) +
log(COVID_3$Expenditure_2016) + log(COVID_3$Avg_IQ_temperature +
273.15))
Residuals:
Min 1Q Median 3Q Max
-4.0920 -0.8525 -0.1514 0.8196 5.1706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 35.5877 20.0126 1.778 0.07752 .
log(COVID_3$Year_2018) 7.5391 0.6026 12.510 < 2e-16 ***
log(COVID_3$Expenditure_2016) 0.8960 0.3166 2.831 0.00533 **
log(COVID_3$Avg_IQ_temperature + 273.15) -5.3731 3.5231 -1.525 0.12947
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.387 on 141 degrees of freedom
Multiple R-squared: 0.6938, Adjusted R-squared: 0.6873
F-statistic: 106.5 on 3 and 141 DF, p-value: < 2.2e-16
According to the AIC criterion, a model with HDI, % spent on health and average temperature is better:
Normality of residuals:
Model_residuals<-data.frame(Residuals=Mod2$residuals)
ggplot(Model_residuals, aes(Residuals)) +
geom_histogram(fill="dodgerblue4",bins=18,alpha=0.8) +
ggtitle("Histogram of model residuals") +
xlab("Residuals") +
ylab("Frequency") +
theme_minimal()
shapiro.test(Mod2$residuals)
Shapiro-Wilk normality test
data: Mod2$residuals
W = 0.98266, p-value = 0.06388
By analysing the histogram, and using the Shapiro-Wilk test, there is not enough statistical evidence to reject the null hypothesis that the residuals of the model are following a normal distribution. Normal distribution is assumed.
Evaluate if a model with temperature Vs. a one without temperature.
Separate between training and testing:
set.seed(179819)
Sample <- sample(1:length(COVID_3$Cases_million),length(COVID_3$Cases_million)*0.20)
t.testing<- COVID_3[Sample,]
t.training<- COVID_3[-Sample,]
Transform the training and testing variables as before:
t.training<-t.training %>% mutate(Cases_million_log=log(Cases_million),HDI_log=log(Year_2018),
GDP_log=log(Expenditure_2016),
Temperature_log_kelvin=log(Avg_IQ_temperature+273.15))
t.training<-t.training[,14:17]
t.testing<-t.testing %>% mutate(Cases_million_log=log(Cases_million),HDI_log=log(Year_2018),
GDP_log=log(Expenditure_2016),
Temperature_log_kelvin=log(Avg_IQ_temperature+273.15))
t.testing<-t.testing[,14:17]
Fit the same model with training
Mod3<-lm(Cases_million_log~., data=t.training)
summary(Mod3)
Call:
lm(formula = Cases_million_log ~ ., data = t.training)
Residuals:
Min 1Q Median 3Q Max
-4.0745 -0.7898 -0.0155 0.8534 5.1967
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.0107 23.3366 1.972 0.0511 .
HDI_log 7.5427 0.6704 11.250 <2e-16 ***
GDP_log 0.8721 0.3560 2.450 0.0158 *
Temperature_log_kelvin -7.1996 4.1003 -1.756 0.0818 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.404 on 112 degrees of freedom
Multiple R-squared: 0.7062, Adjusted R-squared: 0.6983
F-statistic: 89.72 on 3 and 112 DF, p-value: < 2.2e-16
Error functions:
# Residual Sum of Square (RSS)
RSS<-function(Pred,Actual) {
ss<-sum((Actual-Pred)^2)
return(ss)
}
# Residual Standard Error (RSE)
RSE<-function(Pred,Real,NumPred) {
N<-length(Real)-NumPred-1
ss<-sqrt((1/N)*RSS(Pred,Real))
return(ss)
}
# Mean Squared Error
MSE <- function(Pred,Actual) {
N<-length(Actual)
ss<-(1/N)*RSS(Pred,Actual)
return(ss)
}
# Relative error
RelativeError<-function(Pred,Actual) {
ss<-sum(abs(Actual-Pred))/sum(abs(Actual))
return(ss)
}
Prediction:
Pred<-predict(Mod3,t.testing)
Errors:
RSS_Mod3<-RSS(Pred,t.testing$Cases_million_log)
RSE_Mod3<-RSE(Pred,t.testing$Cases_million_log,dim(t.testing)[2]-1)
MSE_Mod3<-MSE(Pred,t.testing$Cases_million_log)
RelativeError_Mod3<-RelativeError(Pred,t.testing$Cases_million_log)
Mod3_Errors<-c(RSS_Mod3,RSE_Mod3,MSE_Mod3,RelativeError_Mod3)
Now, a model without temperature:
t.training <- t.training %>% select(-Temperature_log_kelvin)
t.testing <- t.testing %>% select(-Temperature_log_kelvin)
Mod4<-lm(Cases_million_log~., data=t.training)
summary(Mod4)
Call:
lm(formula = Cases_million_log ~ ., data = t.training)
Residuals:
Min 1Q Median 3Q Max
-3.9956 -0.7580 -0.0306 0.8623 5.2520
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.0545 0.7316 6.909 3.04e-10 ***
HDI_log 8.1269 0.5874 13.836 < 2e-16 ***
GDP_log 1.0734 0.3401 3.156 0.00205 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.416 on 113 degrees of freedom
Multiple R-squared: 0.6981, Adjusted R-squared: 0.6927
F-statistic: 130.6 on 2 and 113 DF, p-value: < 2.2e-16
Prediction:
Pred<-predict(Mod4,t.testing)
Errors:
RSS_Mod4<-RSS(Pred,t.testing$Cases_million_log)
RSE_Mod4<-RSE(Pred,t.testing$Cases_million_log,dim(t.testing)[2]-1)
MSE_Mod4<-MSE(Pred,t.testing$Cases_million_log)
RelativeError_Mod4<-RelativeError(Pred,t.testing$Cases_million_log)
Mod4_Errors<-c(RSS_Mod4,RSE_Mod4,MSE_Mod4,RelativeError_Mod4)
Create a radarplot to easily compare errors:
Errors<-rbind(Mod3_Errors,Mod4_Errors)
rownames(Errors)<-c("Model with temperature","Model without temperature")
colnames(Errors)<-c("Residual Sum of Square","Residual Standard Error","Mean Squared Error","Relative error")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Error comparison")
legend <-legend(1.5,1, legend=c("With temperature","Without temperature"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Therefore, a model without temperature is better. But because temperature was included using the stepwise AIC criterion, and because its coefficient was significant (at an alpha of 10%), it will be considered for the day-to-day forecast
For each of the following countries, the best suitable ARIMA model will be fitted, and its prediction power will be compared (a model with temperature and a model without)
The weather considered for these models will be the maximum presented and forecasted in AccuWeather
Model with temperature
CostaRica_Temp<-read.csv("CostaRica_Temperature.csv",sep=";")
CostaRica_Temp$Date<-as.Date(CostaRica_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_CostaRica<- COVID_2 %>%
filter(Country.Region %in% c("Costa Rica")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_CostaRica_Temp<-COVID_2_Day_CostaRica %>%
inner_join(CostaRica_Temp,by=c("Date2"="Date"))
COVID_Day_series_CostaRica_Temp<-xts(COVID_2_Day_CostaRica_Temp$World_confirmed, order.by=COVID_2_Day_CostaRica_Temp$Date2)
COVID_series_CostaRica_Temp_Train<-COVID_Day_series_CostaRica_Temp[1:64] #Until March 25th
COVID_series_CostaRica_Temp_Test<-COVID_Day_series_CostaRica_Temp[65:length(COVID_Day_series_CostaRica_Temp)] #From March 26th onwards
Get the differentiated series: In this case until a second difference was suitable
plot(diff(diff(COVID_series_CostaRica_Temp_Train)),type="l",main="Costa Rica: 2nd difference")
Correlograms of second diference:
tsdisplay(diff(diff(COVID_series_CostaRica_Temp_Train)), main="Costa Rica ACF and PACF correlograms")
By analyzing the autocorrelation and partial autocorrelation function plots. A model with 1 autoregressive term, and one moving average term will be fitted Arima model: ARIMA(1,2,1)
ARIMA1_CostaRica<-Arima(COVID_series_CostaRica_Temp_Train,order=c(1,2,1),xreg=COVID_2_Day_CostaRica_Temp$Temperature_CostaRica[1:64])
summary(ARIMA1_CostaRica)
Series: COVID_series_CostaRica_Temp_Train
Regression with ARIMA(1,2,1) errors
Coefficients:
ar1 ma1 xreg
-0.7967 0.4328 -0.2134
s.e. 0.1462 0.1840 0.1159
sigma^2 estimated as 7.227: log likelihood=-147.93
AIC=303.86 AICc=304.57 BIC=312.37
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.4281342 2.581091 1.397242 NaN Inf 0.4379416 -0.009839234
Predictive error functions:
#Relative error
RE <- function(Fore,Actual) {
return(sum(abs(Fore-Actual))/abs(sum(Actual)))
}
#MAPE
MAPE<-function(Fore,Actual){
return(
mean(abs(Actual-Fore)/abs(Actual))*100
)
}
# mean squared error (MSE)
MSE<-function(Fore,Actual) {
N<-length(Actual)
ss<-sum((Actual-Fore)^2)
return((1/N)*ss)
}
#PFA
PFA <- function(Fore,Actual) {
Total<-0
N<-length(Fore)
for(i in 1:N) {
if(Fore[i]>=Actual[i])
Total<-Total+1
}
return(Total/N)
}
Forecast prediction to compare:
Test_CostaRica_Temp<-COVID_2_Day_CostaRica_Temp$Temperature_CostaRica[65:length(COVID_Day_series_CostaRica_Temp)]
P_CRI_1<-forecast(ARIMA1_CostaRica,xreg = Test_CostaRica_Temp)
Calculate errors:
RE_CRI_1<-RE(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MAPE_CRI_1<-MAPE(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MSE_CRI_1<-MSE(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
PFA_CRI_1<-PFA(P_CRI_1$mean,as.vector(COVID_series_CostaRica_Temp_Test))
Errors.CRI_1<-c(RE_CRI_1,MAPE_CRI_1,MSE_CRI_1,PFA_CRI_1)
Errors.CRI_1
[1] 0.03098311 3.43225773 252.77719656 0.45000000
Model without temperature
ARIMA(1,2,0)
ARIMA2_CostaRica<-Arima(COVID_series_CostaRica_Temp_Train,order=c(1,2,0))
summary(ARIMA2_CostaRica)
Series: COVID_series_CostaRica_Temp_Train
ARIMA(1,2,0)
Coefficients:
ar1
-0.4701
s.e. 0.1133
sigma^2 estimated as 7.607: log likelihood=-150.5
AIC=304.99 AICc=305.19 BIC=309.24
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.5145597 2.692615 1.200452 7.86228 21.76863 0.3762612 0.04160954
Forecast prediction to compare
P_CRI_2<-forecast(ARIMA2_CostaRica,h=length(COVID_series_CostaRica_Temp_Test))
Calculate errors:
RE_CRI_2<-RE(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MAPE_CRI_2<-MAPE(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
MSE_CRI_2<-MSE(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
PFA_CRI_2<-PFA(P_CRI_2$mean,as.vector(COVID_series_CostaRica_Temp_Test))
Errors.CRI_2<-c(RE_CRI_2,MAPE_CRI_2,MSE_CRI_2,PFA_CRI_2)
Errors.CRI_2
[1] 0.0291204 3.3461353 227.1872124 0.4000000
Errors<-rbind(Errors.CRI_1,Errors.CRI_2)
rownames(Errors)<-c("ARIMAX(1,2,1)","ARIMA(1,2,0)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Costa Rica forecast: error comparison on test set")
legend <-legend(1.5,1, legend=c("ARIMAX(1,2,1)","ARIMA(1,2,0)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model without temperature
Make model with the overall series
ARIMA_Final_CostaRica<-Arima(COVID_Day_series_CostaRica_Temp,order=c(0,2,1))
summary(ARIMA_Final_CostaRica)
Series: COVID_Day_series_CostaRica_Temp
ARIMA(0,2,1)
Coefficients:
ma1
-0.3824
s.e. 0.1256
sigma^2 estimated as 20.06: log likelihood=-238.87
AIC=481.75 AICc=481.9 BIC=486.56
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.2023214 4.39795 2.334426 3.912512 11.69981 0.3135232 0.02228771
Final forecast:
Future_CostaRica_Temp<-CostaRica_Temp$Temperature_CostaRica[CostaRica_Temp$Date>max(COVID_2$Date2)]
P_CRI_Final<-forecast(ARIMA_Final_CostaRica,h = length(Future_CostaRica_Temp))
Low_lim_CRI<-data.frame(P_CRI_Final$lower)[,2]
Upp_lim_CRI<-data.frame(P_CRI_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_CostaRica_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_CostaRica_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_CostaRica_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_CRI <- xts(Low_lim_CRI,order.by=per_2)
Forecast_CRI <- xts(P_CRI_Final$mean,order.by=per_2)
Upp_lim_CRI <- xts(Upp_lim_CRI,order.by=per_2)
predNA <- rep(NA, length(Forecast_CRI))
B <- cbind(predNA, Low_lim_CRI, Forecast_CRI, Upp_lim_CRI)
all_series_CRI <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_CRI) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_CRI, main="SARS-COV2-outbreak: Total Costa Rica cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(189/255,44/255,47/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(10/255,44/255,119/255)) %>%
dyRangeSelector()
NA
Model with temperature
Mexico_Temp<-read.csv("Mexico_Temperature.csv",sep=";")
Mexico_Temp$Date<-as.Date(Mexico_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Mexico<- COVID_2 %>%
filter(Country.Region %in% c("Mexico")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Mexico_Temp<-COVID_2_Day_Mexico %>%
inner_join(Mexico_Temp,by=c("Date2"="Date"))
COVID_Day_series_Mexico_Temp<-xts(COVID_2_Day_Mexico_Temp$World_confirmed, order.by=COVID_2_Day_Mexico_Temp$Date2)
COVID_series_Mexico_Train<-COVID_Day_series_Mexico_Temp[1:74] #Until April 4th
COVID_series_Mexico_Test<-COVID_Day_series_Mexico_Temp[75:length(COVID_Day_series_Mexico_Temp)] #From April 5th onwards
Get the differentiated series: In this case until a second difference was suitable
plot(diff(diff(COVID_series_Mexico_Train)),type="l",main="Mexico: 2nd difference")
Correlograms of second diference:
tsdisplay(diff(diff(COVID_series_Mexico_Train)),main="Mexico ACF and PACF correlograms")
By analyzing the autocorrelation and partial autocorrelation function plots. A model with 2 autoregressive terms, and 3 moving average terms will be fitted
Arima model: ARIMA(2,2,3)
ARIMA1_Mexico<-Arima(COVID_series_Mexico_Train,order=c(2,2,3),xreg=COVID_2_Day_Mexico_Temp$Temperature_Mexico[1:74])
summary(ARIMA1_Mexico)
Series: COVID_series_Mexico_Train
Regression with ARIMA(2,2,3) errors
Coefficients:
ar1 ar2 ma1 ma2 ma3 xreg
-0.9027 -0.7333 1.1212 0.8938 0.6734 0.0623
s.e. 0.1121 0.1592 0.1093 0.2027 0.1153 0.2567
sigma^2 estimated as 123.3: log likelihood=-273.92
AIC=561.84 AICc=563.59 BIC=577.78
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 1.716467 10.48796 5.177678 NaN Inf 0.2239162 -0.1047828
Forecast prediction to compare
Test_Mexico_Temp<-COVID_2_Day_Mexico_Temp$Temperature_Mexico[75:length(COVID_Day_series_Mexico_Temp)]
P_MEX_1<-forecast(ARIMA1_Mexico,xreg = Test_Mexico_Temp)
Calculate errors:
RE_MEX_1<-RE(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
MAPE_MEX_1<-MAPE(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
MSE_MEX_1<-MSE(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
PFA_MEX_1<-PFA(P_MEX_1$mean,as.vector(COVID_series_Mexico_Test))
Errors.MEX_1<-c(RE_MEX_1,MAPE_MEX_1,MSE_MEX_1,PFA_MEX_1)
Errors.MEX_1
[1] 2.067523e-01 1.776974e+01 7.367257e+05 0.000000e+00
Model without temperature
ARIMA(2,2,4)
ARIMA2_Mexico<-Arima(COVID_series_Mexico_Train,order=c(2,2,4))
summary(ARIMA2_Mexico)
Series: COVID_series_Mexico_Train
ARIMA(2,2,4)
Coefficients:
ar1 ar2 ma1 ma2 ma3 ma4
-0.7232 -0.7137 0.8661 0.6669 0.4019 -0.3260
s.e. 0.1415 0.1688 0.1697 0.2135 0.1529 0.1463
sigma^2 estimated as 116.5: log likelihood=-272.23
AIC=558.45 AICc=560.2 BIC=574.39
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 2.187346 10.19436 4.88144 4.696633 18.33364 0.2111049 -0.05173888
Forecast prediction to compare
P_MEX_2<-forecast(ARIMA2_Mexico,h=length(COVID_series_Mexico_Test))
Calculate errors:
RE_MEX_2<-RE(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
MAPE_MEX_2<-MAPE(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
MSE_MEX_2<-MSE(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
PFA_MEX_2<-PFA(P_MEX_2$mean,as.vector(COVID_series_Mexico_Test))
Errors.MEX_2<-c(RE_MEX_2,MAPE_MEX_2,MSE_MEX_2,PFA_MEX_2)
Errors.MEX_2
[1] 2.121065e-01 1.822148e+01 7.758630e+05 0.000000e+00
Errors<-rbind(Errors.MEX_1,Errors.MEX_2)
rownames(Errors)<-c("ARIMAX(2,2,3)","ARIMA(2,2,4)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Mexico forecast: error comparison on test set")
legend <-legend(1.5,1, legend=c("ARIMAX(2,2,3)","ARIMA(2,2,4)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model with temperature
Make model with the overall series
ARIMA_Final_Mexico<-Arima(COVID_Day_series_Mexico_Temp,order=c(2,2,1),xreg=COVID_2_Day_Mexico_Temp$Temperature_Mexico)
summary(ARIMA_Final_Mexico)
Series: COVID_Day_series_Mexico_Temp
Regression with ARIMA(2,2,1) errors
Coefficients:
ar1 ar2 ma1 xreg
0.3938 0.5755 -0.8956 -0.1913
s.e. 0.1076 0.1022 0.0648 1.0722
sigma^2 estimated as 693.1: log likelihood=-383.01
AIC=776.01 AICc=776.8 BIC=788.05
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 3.482614 25.36974 11.9289 NaN Inf 0.1974668 -0.02342905
Final forecast:
Future_Mexico_Temp<-Mexico_Temp$Temperature_Mexico[Mexico_Temp$Date>max(COVID_2$Date2)]
P_MEX_Final<-forecast(ARIMA_Final_Mexico,xreg = Future_Mexico_Temp)
Low_lim_MEX<-data.frame(P_MEX_Final$lower)[,2]
Upp_lim_MEX<-data.frame(P_MEX_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Mexico_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Mexico_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Mexico_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_MEX <- xts(Low_lim_MEX,order.by=per_2)
Forecast_MEX <- xts(P_MEX_Final$mean,order.by=per_2)
Upp_lim_MEX <- xts(Upp_lim_MEX,order.by=per_2)
predNA <- rep(NA, length(Forecast_MEX))
B <- cbind(predNA, Low_lim_MEX, Forecast_MEX, Upp_lim_MEX)
all_series_MEX <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_MEX) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_MEX, main="SARS-COV2-outbreak: Total Mexico cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(189/255,44/255,47/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(43/255,102/255,73/255)) %>%
dyRangeSelector()
NA
Model with temperature
Italy_Temp<-read.csv("Italy_Temperature.csv",sep=";")
Italy_Temp$Date<-as.Date(Italy_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Italy<- COVID_2 %>%
filter(Country.Region %in% c("Italy")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Italy_Temp<-COVID_2_Day_Italy %>%
inner_join(Italy_Temp,by=c("Date2"="Date"))
COVID_Day_series_Italy_Temp<-xts(COVID_2_Day_Italy_Temp$World_confirmed, order.by=COVID_2_Day_Italy_Temp$Date2)
COVID_Day_series_Italy_Train<-COVID_Day_series_Italy_Temp[1:64] #Until March 25th
COVID_Day_series_Italy_Test<-COVID_Day_series_Italy_Temp[65:length(COVID_Day_series_Italy_Temp)] #From March 26th onwards
Get the lagged difference:
plot(diff(diff(COVID_Day_series_Italy_Train)),type="l",main="Italy: 2nd difference")
Correlograms of first diference:
tsdisplay(diff(diff(COVID_Day_series_Italy_Train)),main="Italy ACF and PACF correlograms")
By analyzing the autocorrelation and partial autocorrelation function plots. A model with 1 autoregressive term will be fitted.
Arima model: ARIMA(1,2,0)
ARIMA1_Italy<-auto.arima(COVID_Day_series_Italy_Train,
xreg = COVID_2_Day_Italy_Temp$Temperature_Italy[1:64])
ARIMA1_Italy<-Arima(COVID_Day_series_Italy_Train,order=c(1,2,0),xreg=COVID_2_Day_Italy_Temp$Temperature_Italy[1:64])
summary(ARIMA1_Italy)
Series: COVID_Day_series_Italy_Train
Regression with ARIMA(1,2,0) errors
Coefficients:
ar1 xreg
-0.5094 21.4065
s.e. 0.1119 20.2612
sigma^2 estimated as 489396: log likelihood=-493.24
AIC=992.47 AICc=992.89 BIC=998.85
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 123.6808 677.3545 335.2238 NaN Inf 0.2839123 -0.005148955
Forecast prediction to compare
Test_Italy_Temp<-COVID_2_Day_Italy_Temp$Temperature_Italy[65:length(COVID_Day_series_Italy_Temp)]
P_ITA_1<-forecast(ARIMA1_Italy,xreg = Test_Italy_Temp)
Calculate errors:
RE_ITA_1<-RE(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
MAPE_ITA_1<-MAPE(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
MSE_ITA_1<-MSE(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
PFA_ITA_1<-PFA(P_ITA_1$mean,as.vector(COVID_Day_series_Italy_Test))
Errors.ITA_1<-c(RE_ITA_1,MAPE_ITA_1,MSE_ITA_1,PFA_ITA_1)
Errors.ITA_1
[1] 4.619212e-02 4.067575e+00 6.062428e+07 7.500000e-01
Model without temperature
ARIMA(1,2,0)
ARIMA2_Italy<-Arima(COVID_Day_series_Italy_Train,order=c(1,2,0))
summary(ARIMA2_Italy)
Series: COVID_Day_series_Italy_Train
ARIMA(1,2,0)
Coefficients:
ar1
-0.5414
s.e. 0.1046
sigma^2 estimated as 489700: log likelihood=-493.79
AIC=991.58 AICc=991.79 BIC=995.84
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 125.8084 683.1877 295.442 5.335548 11.24576 0.2502197 -0.02348722
Forecast prediction to compare
P_ITA_2<-forecast(ARIMA2_Italy,h=length(COVID_Day_series_Italy_Test))
Calculate errors:
RE_ITA_2<-RE(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
MAPE_ITA_2<-MAPE(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
MSE_ITA_2<-MSE(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
PFA_ITA_2<-PFA(P_ITA_2$mean,as.vector(COVID_Day_series_Italy_Test))
Errors.ITA_2<-c(RE_ITA_2,MAPE_ITA_2,MSE_ITA_2,PFA_ITA_2)
Errors.ITA_2
[1] 4.366177e-02 3.858628e+00 5.437200e+07 7.000000e-01
Errors<-rbind(Errors.ITA_1,Errors.ITA_2)
rownames(Errors)<-c("ARIMAX(1,2,0)","ARIMA(1,2,0)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Italy forecast: error comparison on test set")
legend <-legend(1.5,1, legend=c("ARIMAX(1,2,0)","ARIMA(1,2,0)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model without temperature
Make model with the overall series
ARIMA_Final_Italy<-Arima(COVID_Day_series_Italy_Temp,order=c(1,2,0))
summary(ARIMA_Final_Italy)
Series: COVID_Day_series_Italy_Temp
ARIMA(1,2,0)
Coefficients:
ar1
-0.4466
s.e. 0.0977
sigma^2 estimated as 492850: log likelihood=-653.39
AIC=1310.77 AICc=1310.93 BIC=1315.59
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 52.1456 689.3822 355.6425 3.725906 8.274914 0.1816647 0.01556872
Final forecast:
Future_Italy_Temp<-Italy_Temp$Temperature_Italy[Italy_Temp$Date>max(COVID_2$Date2)]
P_ITA_Final<-forecast(ARIMA_Final_Italy,h=length(Future_Italy_Temp))
Low_lim_ITA<-data.frame(P_ITA_Final$lower)[,2]
Upp_lim_ITA<-data.frame(P_ITA_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Italy_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Italy_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Italy_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_ITA <- xts(Low_lim_ITA,order.by=per_2)
Forecast_ITA <- xts(P_ITA_Final$mean,order.by=per_2)
Upp_lim_ITA <- xts(Upp_lim_ITA,order.by=per_2)
predNA <- rep(NA, length(Forecast_ITA))
B <- cbind(predNA, Low_lim_ITA, Forecast_ITA, Upp_lim_ITA)
all_series_ITA <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_ITA) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_ITA, main="SARS-COV2-outbreak: Total Italy cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(190/255,59/255,61/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(64/255,143/255,78/255)) %>%
dyRangeSelector()
NA
Model with temperature
Lebanon_Temp<-read.csv("Lebanon_Temperature.csv",sep=";")
Lebanon_Temp$Date<-as.Date(Lebanon_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Lebanon<- COVID_2 %>%
filter(Country.Region %in% c("Lebanon")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Lebanon_Temp<-COVID_2_Day_Lebanon %>%
inner_join(Lebanon_Temp,by=c("Date2"="Date"))
COVID_Day_series_Lebanon_Temp<-xts(COVID_2_Day_Lebanon_Temp$World_confirmed, order.by=COVID_2_Day_Lebanon_Temp$Date2)
COVID_series_Lebanon_Train<-COVID_Day_series_Lebanon_Temp[1:74] #Until April 4th
COVID_series_Lebanon_Test<-COVID_Day_series_Lebanon_Temp[75:length(COVID_Day_series_Lebanon_Temp)] #From April 4th onwards
Get the differentiated series: In this case until a first difference was suitable
plot(diff(COVID_series_Lebanon_Train),type="l",main="Lebanon: 1st difference")
Correlograms of first diference:
tsdisplay(diff(COVID_series_Lebanon_Train),main="Lebanon ACF and PACF correlograms")
Arima model: ARIMA(1,1,2)
ARIMA1_Lebanon<-Arima(COVID_series_Lebanon_Train,order=c(1,1,2),xreg=COVID_2_Day_Lebanon_Temp$Temperature_Lebanon[1:74])
summary(ARIMA1_Lebanon)
Series: COVID_series_Lebanon_Train
Regression with ARIMA(1,1,2) errors
Coefficients:
ar1 ma1 ma2 xreg
0.9216 -0.8504 0.4970 0.0461
s.e. 0.0488 0.0989 0.1263 0.2338
sigma^2 estimated as 60.69: log likelihood=-252.36
AIC=514.72 AICc=515.62 BIC=526.17
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 1.101941 7.522299 3.444682 NaN Inf 0.4835803 -0.03853757
Forecast prediction to compare
Test_Lebanon_Temp<-COVID_2_Day_Lebanon_Temp$Temperature_Lebanon[75:length(COVID_Day_series_Lebanon_Temp)]
P_LBN_1<-forecast(ARIMA1_Lebanon,xreg = Test_Lebanon_Temp)
Calculate errors:
RE_LBN_1<-RE(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
MAPE_LBN_1<-MAPE(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
MSE_LBN_1<-MSE(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
PFA_LBN_1<-PFA(P_LBN_1$mean,as.vector(COVID_series_Lebanon_Test))
Errors.LBN_1<-c(RE_LBN_1,MAPE_LBN_1,MSE_LBN_1,PFA_LBN_1)
Errors.LBN_1
[1] 0.03465457 3.32822038 607.23729449 0.30000000
Model without temperature
ARIMA(0,2,2)
#Auto Arima for Lebanon:
ARIMA2_Lebanon<-Arima(COVID_series_Lebanon_Train,order=c(0,2,2))
summary(ARIMA2_Lebanon)
Series: COVID_series_Lebanon_Train
ARIMA(0,2,2)
Coefficients:
ma1 ma2
-0.8655 0.4654
s.e. 0.0939 0.1264
sigma^2 estimated as 61.55: log likelihood=-249.92
AIC=505.84 AICc=506.19 BIC=512.67
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.286841 7.630167 3.533028 2.334374 15.75454 0.4959827 -0.05944259
Forecast prediction to compare
P_LBN_2<-forecast(ARIMA2_Lebanon,h=length(COVID_series_Lebanon_Test))
Calculate errors:
RE_LBN_2<-RE(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
MAPE_LBN_2<-MAPE(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
MSE_LBN_2<-MSE(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
PFA_LBN_2<-PFA(P_LBN_2$mean,as.vector(COVID_series_Lebanon_Test))
Errors.LBN_2<-c(RE_LBN_2,MAPE_LBN_2,MSE_LBN_2,PFA_LBN_2)
Errors.LBN_2
[1] 0.01137244 1.13881038 55.27593453 0.60000000
Errors<-rbind(Errors.LBN_1,Errors.LBN_2)
rownames(Errors)<-c("ARIMAX(1,1,2)","ARIMA(0,2,2)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Lebanon forecast: error comparison on test set")
legend <-legend(1.5,1, legend=c("ARIMAX(1,1,2)","ARIMA(0,2,2)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model without temperature
Make model with the overall series
ARIMA_Final_Lebanon<-Arima(COVID_Day_series_Lebanon_Temp,order=c(2,2,0))
summary(ARIMA_Final_Lebanon)
Series: COVID_Day_series_Lebanon_Temp
ARIMA(2,2,0)
Coefficients:
ar1 ar2
-0.9004 -0.2588
s.e. 0.1056 0.1055
sigma^2 estimated as 62.19: log likelihood=-285.11
AIC=576.21 AICc=576.52 BIC=583.43
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.1624741 7.696121 3.788102 2.444889 12.73385 0.4905031 -0.01955309
Final forecast:
Future_Lebanon_Temp<-Lebanon_Temp$Temperature_Lebanon[Lebanon_Temp$Date>max(COVID_2$Date2)]
P_LBN_Final<-forecast(ARIMA_Final_Lebanon,h=length(Future_Lebanon_Temp))
Low_lim_LBN<-data.frame(P_LBN_Final$lower)[,2]
Upp_lim_LBN<-data.frame(P_LBN_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Lebanon_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Lebanon_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Lebanon_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_LBN <- xts(Low_lim_LBN,order.by=per_2)
Forecast_LBN <- xts(P_LBN_Final$mean,order.by=per_2)
Upp_lim_LBN <- xts(Upp_lim_LBN,order.by=per_2)
predNA <- rep(NA, length(Forecast_ITA))
B <- cbind(predNA, Low_lim_LBN, Forecast_LBN, Upp_lim_LBN)
all_series_LBN <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_LBN) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_LBN, main="SARS-COV2-outbreak: Total Lebanon cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(218/255,55/255,50/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(73/255,163/255,90/255)) %>%
dyRangeSelector()
NA
Model with temperature
Colombia_Temp<-read.csv("Colombia_Temperature.csv",sep=";")
Colombia_Temp$Date<-as.Date(Colombia_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Colombia<- COVID_2 %>%
filter(Country.Region %in% c("Colombia")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Colombia_Temp<-COVID_2_Day_Colombia %>%
inner_join(Colombia_Temp,by=c("Date2"="Date"))
COVID_Day_series_Colombia_Temp<-xts(COVID_2_Day_Colombia_Temp$World_confirmed, order.by=COVID_2_Day_Colombia_Temp$Date2)
COVID_series_Colombia_Train<-COVID_Day_series_Colombia_Temp[1:74] #Until April 4th
COVID_series_Colombia_Test<-COVID_Day_series_Colombia_Temp[75:length(COVID_Day_series_Colombia_Temp)] #From April 5th onwards
Get the differentiated series: In this case until a second difference was suitable
plot(diff(diff(COVID_series_Colombia_Train)),type="l",main="Colombia: 2nd difference")
Correlograms of second diference:
tsdisplay(diff(diff(COVID_series_Colombia_Train)), main="Colombia ACF and PACF correlograms")
By analyzing the autocorrelation and partial autocorrelation function plots. A model with 2 autoregressive terms, and 2 moving average terms will be fitted
Arimax model: ARIMAX(2,2,2)
ARIMA1_Colombia<-Arima(COVID_series_Colombia_Train,order=c(2,2,2),xreg=COVID_2_Day_Colombia_Temp$Temperature_Colombia[1:74])
summary(ARIMA1_Colombia)
Series: COVID_series_Colombia_Train
Regression with ARIMA(2,2,2) errors
Coefficients:
ar1 ar2 ma1 ma2 xreg
-0.2675 -0.8894 -0.0039 0.6242 -0.2343
s.e. 0.1037 0.1016 0.1495 0.1732 0.5167
sigma^2 estimated as 217.7: log likelihood=-294.03
AIC=600.06 AICc=601.35 BIC=613.72
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 2.197353 14.04005 6.409043 NaN Inf 0.3327597 -0.07966735
Forecast prediction to compare
Test_Colombia_Temp<-COVID_2_Day_Colombia_Temp$Temperature_Colombia[75:length(COVID_Day_series_Colombia_Temp)]
P_COL_1<-forecast(ARIMA1_Colombia,xreg = Test_Colombia_Temp)
Calculate errors:
RE_COL_1<-RE(P_COL_1$mean,as.vector(COVID_series_Colombia_Test))
MAPE_COL_1<-MAPE(P_COL_1$mean,as.vector(COVID_series_Colombia_Test))
MSE_COL_1<-MSE(P_COL_1$mean,as.vector(COVID_series_Colombia_Test))
PFA_COL_1<-PFA(P_COL_1$mean,as.vector(COVID_series_Colombia_Test))
Errors.COL_1<-c(RE_COL_1,MAPE_COL_1,PFA_COL_1,MSE_COL_1)
Errors.COL_1
[1] 1.038332e-01 9.386524e+00 2.000000e-01 7.935003e+04
Model without temperature
ARIMA(2,2,2)
ARIMA2_Colombia<-Arima(COVID_series_Colombia_Train,order=c(2,2,2))
summary(ARIMA2_Colombia)
Series: COVID_series_Colombia_Train
ARIMA(2,2,2)
Coefficients:
ar1 ar2 ma1 ma2
-0.2672 -0.8913 0.0014 0.6244
s.e. 0.1037 0.1025 0.1486 0.1753
sigma^2 estimated as 215.1: log likelihood=-294.13
AIC=598.26 AICc=599.17 BIC=609.65
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 2.199355 14.05992 6.130841 5.459672 19.97005 0.3183153 -0.08053967
Forecast prediction to compare
P_COL_2<-forecast(ARIMA2_Colombia,h=length(COVID_series_Colombia_Test))
Calculate errors:
RE_COL_2<-RE(P_COL_2$mean,as.vector(COVID_series_Colombia_Test))
MAPE_COL_2<-MAPE(P_COL_2$mean,as.vector(COVID_series_Colombia_Test))
MSE_COL_2<-MSE(P_COL_2$mean,as.vector(COVID_series_Colombia_Test))
PFA_COL_2<-PFA(P_COL_2$mean,as.vector(COVID_series_Colombia_Test))
Errors.COL_2<-c(RE_COL_2,MAPE_COL_2,MSE_COL_2,PFA_COL_2)
Errors.COL_2
[1] 0.103347 9.346102 78591.647234 0.200000
Errors<-rbind(Errors.COL_1,Errors.COL_2)
rownames(Errors)<-c("ARIMAX(2,2,2)","ARIMA(2,2,2)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Colombia forecast: error comparison on test set")
legend <-legend(1.5,1, legend=c("ARIMAX(2,2,2)","ARIMA(2,2,2)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model without temperature
Make model with the overall series
ARIMA_Final_Colombia<-Arima(COVID_Day_series_Colombia_Temp,order=c(5,2,1))
summary(ARIMA_Final_Colombia)
Series: COVID_Day_series_Colombia_Temp
ARIMA(5,2,1)
Coefficients:
ar1 ar2 ar3 ar4 ar5 ma1
0.1801 -0.2926 0.2750 -0.2965 -0.4795 -0.3580
s.e. 0.1445 0.0999 0.1248 0.1166 0.1178 0.1353
sigma^2 estimated as 603: log likelihood=-377.42
AIC=768.85 AICc=770.36 BIC=785.69
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 4.289777 23.35663 11.44386 5.451225 17.60467 0.3188454 -0.08898061
Final forecast:
Future_Colombia_Temp<-Colombia_Temp$Temperature_Colombia[Colombia_Temp$Date>max(COVID_2$Date2)]
P_COL_Final<-forecast(ARIMA_Final_Colombia,h=length(Future_Colombia_Temp))
Low_lim_COL<-data.frame(P_COL_Final$lower)[,2]
Upp_lim_COL<-data.frame(P_COL_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Colombia_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Colombia_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Colombia_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_COL <- xts(Low_lim_COL,order.by=per_2)
Forecast_COL <- xts(P_COL_Final$mean,order.by=per_2)
Upp_lim_COL <- xts(Upp_lim_COL,order.by=per_2)
predNA <- rep(NA, length(Forecast_COL))
B <- cbind(predNA, Low_lim_COL, Forecast_COL, Upp_lim_COL)
all_series_COL <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_COL) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_COL, main="SARS-COV2-outbreak: Total Colombia cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(17/255,57/255,141/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(246/255,209/255,75/255)) %>%
dyRangeSelector()
NA
Model with temperature
Chile_Temp<-read.csv("Chile_Temperature.csv",sep=";")
Chile_Temp$Date<-as.Date(Chile_Temp$Date,format="%d/%m/%Y")
COVID_2_Day_Chile<- COVID_2 %>%
filter(Country.Region %in% c("Chile")) %>%
group_by(Date2) %>% summarise(World_confirmed=sum(Confirmed))
COVID_2_Day_Chile_Temp<-COVID_2_Day_Chile %>%
inner_join(Chile_Temp,by=c("Date2"="Date"))
COVID_Day_series_Chile_Temp<-xts(COVID_2_Day_Chile_Temp$World_confirmed, order.by=COVID_2_Day_Chile_Temp$Date2)
COVID_series_Chile_Train<-COVID_Day_series_Chile_Temp[1:76] #Until April 6th
COVID_series_Chile_Test<-COVID_Day_series_Chile_Temp[77:length(COVID_Day_series_Chile_Temp)] #From April 7th onwards
Get the differentiated series: In this case a first difference was suitable
plot(diff(COVID_series_Chile_Train),type="l",main="Chile: 1st difference")
Correlograms of first diference:
tsdisplay(diff(COVID_series_Chile_Train),main="Chile ACF and PACF correlograms")
By analyzing the autocorrelation and partial autocorrelation function plots. A model with 1 autoregressive term will be fitted.
Arima model: ARIMA(1,1,0)
ARIMA1_Chile<-Arima(COVID_series_Chile_Train,order=c(1,1,0),xreg=COVID_2_Day_Chile_Temp$Temperature_Chile[1:76])
summary(ARIMA1_Chile)
Series: COVID_series_Chile_Train
Regression with ARIMA(1,1,0) errors
Coefficients:
ar1 xreg
0.9734 -1.3786
s.e. 0.0265 1.2712
sigma^2 estimated as 1708: log likelihood=-386
AIC=778.01 AICc=778.35 BIC=784.96
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 6.143615 40.50828 20.18216 NaN Inf 0.3143638 -0.5175849
Forecast prediction to compare
Test_Chile_Temp<-COVID_2_Day_Chile_Temp$Temperature_Chile[77:length(COVID_Day_series_Chile_Temp)]
P_CHL_1<-forecast(ARIMA1_Chile,xreg = Test_Chile_Temp)
Calculate errors:
RE_CHL_1<-RE(P_CHL_1$mean,as.vector(COVID_series_Chile_Test))
MAPE_CHL_1<-MAPE(P_CHL_1$mean,as.vector(COVID_series_Chile_Test))
MSE_CHL_1<-MSE(P_CHL_1$mean,as.vector(COVID_series_Chile_Test))
PFA_CHL_1<-PFA(P_CHL_1$mean,as.vector(COVID_series_Chile_Test))
Errors.CHL_1<-c(RE_CHL_1,MAPE_CHL_1,MSE_CHL_1,PFA_CHL_1)
Errors.CHL_1
[1] 5.184414e-02 4.820284e+00 1.619636e+05 1.250000e-01
Model without temperature
ARIMA(1,2,4)
ARIMA2_Chile<-Arima(COVID_series_Chile_Train,order=c(1,2,4))
summary(ARIMA2_Chile)
Series: COVID_series_Chile_Train
ARIMA(1,2,4)
Coefficients:
ar1 ma1 ma2 ma3 ma4
0.8988 -1.8812 1.0871 0.0198 -0.0671
s.e. 0.0654 0.1487 0.2748 0.2611 0.1410
sigma^2 estimated as 843.5: log likelihood=-354.47
AIC=720.93 AICc=722.19 BIC=734.76
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 2.726442 27.67343 12.61158 8.098567 15.1208 0.196442 -0.02684075
Forecast prediction to compare
P_CHL_2<-forecast(ARIMA2_Chile,h=length(COVID_series_Chile_Test))
Calculate errors:
RE_CHL_2<-RE(P_CHL_2$mean,as.vector(COVID_series_Chile_Test))
MAPE_CHL_2<-MAPE(P_CHL_2$mean,as.vector(COVID_series_Chile_Test))
MSE_CHL_2<-MSE(P_CHL_2$mean,as.vector(COVID_series_Chile_Test))
PFA_CHL_2<-PFA(P_CHL_2$mean,as.vector(COVID_series_Chile_Test))
Errors.CHL_2<-c(RE_CHL_2,MAPE_CHL_2,MSE_CHL_2,PFA_CHL_2)
Errors.CHL_2
[1] 2.967803e-02 2.799603e+00 5.188357e+04 1.250000e-01
Errors<-rbind(Errors.CHL_1,Errors.CHL_2)
rownames(Errors)<-c("ARIMAX(1,1,0)","ARIMA(1,2,4)")
colnames(Errors)<-c("Relative error","Mean Abs % error","Mean Squared Error","PFA")
Errors<-as.data.frame(Errors)
maximum<-apply(Errors,2,max)
minimum<-apply(Errors,2,min)
Errors<-rbind(minimum,Errors)
Errors<-rbind(maximum,Errors)
radarchart(Errors,maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8,cglcol="gray67",
pcol=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"),
plty=1,
plwd=3,
title="Chile forecast: error comparison on test set")
legend <-legend(1.5,1, legend=c("ARIMAX(1,1,0)","ARIMA(1,2,4)"),
seg.len=-1.4,
title="Errors",
pch=21,
bty="n" ,lwd=3, y.intersp=1, horiz=FALSE,
col=c("dodgerblue2","firebrick2","darkorange2","darkorchid2"))
Conclusion: Keep model without temperature
Make model with the overall series
ARIMA_Final_Chile<-Arima(COVID_Day_series_Chile_Temp,order=c(0,2,1))
summary(ARIMA_Final_Chile)
Series: COVID_Day_series_Chile_Temp
ARIMA(0,2,1)
Coefficients:
ma1
-0.4633
s.e. 0.0934
sigma^2 estimated as 1978: log likelihood=-427.15
AIC=858.3 AICc=858.46 BIC=863.12
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 8.103709 43.6722 20.75474 5.336459 15.41471 0.2175879 -0.02280619
Final forecast:
Future_Chile_Temp<-Chile_Temp$Temperature_Chile[Chile_Temp$Date>max(COVID_2$Date2)]
P_CHL_Final<-forecast(ARIMA_Final_Chile,h=length(Future_Chile_Temp))
Low_lim_CHL<-data.frame(P_CHL_Final$lower)[,2]
Upp_lim_CHL<-data.frame(P_CHL_Final$upper)[,2]
For making the plot:
##Data periods
per_1 <- as.Date(as.character(COVID_2_Day_Chile_Temp$Date2))
per_2 <- seq(as.Date(max(COVID_2_Day_Chile_Temp$Date2)+1,format="%Y-%m-%d"), as.Date("2020-04-30",format="%Y-%m-%d"),"1 day")
# Merge forecast + actuals
data <- xts(COVID_Day_series_Chile_Temp,order.by=per_1)
dataNA <- rep(NA, length(data))
A <- cbind(data,dataNA,dataNA,dataNA)
Low_lim_CHL <- xts(Low_lim_CHL,order.by=per_2)
Forecast_CHL <- xts(P_CHL_Final$mean,order.by=per_2)
Upp_lim_CHL <- xts(Upp_lim_CHL,order.by=per_2)
predNA <- rep(NA, length(Forecast_CHL))
B <- cbind(predNA, Low_lim_CHL, Forecast_CHL, Upp_lim_CHL)
all_series_CHL <- data.frame(rbind(as.matrix(A),as.matrix(B)))
colnames(all_series_CHL) <- c('Actual', 'Lower_limit', 'Forecast', 'Upper_limit')
dygraph(all_series_CHL, main="SARS-COV2-outbreak: Total Chile cases",xlab="Date", ylab="Novel coronavirus cases",width = 750)%>%
dySeries(c('Lower_limit', 'Forecast', 'Upper_limit'),label="Forecast",strokeWidth=2,
drawPoints = TRUE, pointSize = 2, color=rgb(196/255,60/255,44/255)) %>%
dySeries("Actual",drawPoints = TRUE, strokeWidth=2, pointSize = 2,
color=rgb(16/255,59/255,160/255)) %>%
dyRangeSelector()
NA